IODS week 4: Clustering

Let’s load the data and look at it’s dimensions and structure:

## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
## [1] 506  14

So it appears, that the data has information on 14 variables from 506 individual observations.

library(ggplot2)
library(GGally)

ggpairs(Boston[,!colnames(Boston) %in% c('chas', 'rad')])

# Lets print the summaries without chas and ras which were classes.

summary(Boston[,!colnames(Boston) %in% c('chas', 'rad')])
##       crim                zn             indus            nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.3850  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :0.8710  
##        rm             age              dis              tax       
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   :187.0  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.:279.0  
##  Median :6.208   Median : 77.50   Median : 3.207   Median :330.0  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   :408.2  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:666.0  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :711.0  
##     ptratio          black            lstat            medv      
##  Min.   :12.60   Min.   :  0.32   Min.   : 1.73   Min.   : 5.00  
##  1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95   1st Qu.:17.02  
##  Median :19.05   Median :391.44   Median :11.36   Median :21.20  
##  Mean   :18.46   Mean   :356.67   Mean   :12.65   Mean   :22.53  
##  3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :22.00   Max.   :396.90   Max.   :37.97   Max.   :50.00
table(Boston$chas)
## 
##   0   1 
## 471  35
table(Boston$rad)
## 
##   1   2   3   4   5   6   7   8  24 
##  20  24  38 110 115  26  17  24 132

Jänniä yhteyksiä.

boston_s <- as.data.frame(scale(Boston))

summary(boston_s)
##       crim                 zn               indus        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202  
##       chas              nox                rm               age         
##  Min.   :-0.2723   Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331  
##  1st Qu.:-0.2723   1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366  
##  Median :-0.2723   Median :-0.1441   Median :-0.1084   Median : 0.3171  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.2723   3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059  
##  Max.   : 3.6648   Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164  
##       dis               rad               tax             ptratio       
##  Min.   :-1.2658   Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047  
##  1st Qu.:-0.8049   1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876  
##  Median :-0.2790   Median :-0.5225   Median :-0.4642   Median : 0.2746  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6617   3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058  
##  Max.   : 3.9566   Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372  
##      black             lstat              medv        
##  Min.   :-3.9033   Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.: 0.2049   1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median : 0.3808   Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4332   3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 0.4406   Max.   : 3.5453   Max.   : 2.9865
boston_s$crime <- cut(boston_s$crim, breaks = quantile(boston_s$crim), include.lowest = TRUE, labels = c("low", "med_low", "med_high", "high"))

boston_s <- dplyr::select(boston_s, -crim)

selection <- sample(x = 1:nrow(boston_s), size = round(0.8*nrow(boston_s)))

train <- boston_s[selection, ]

test <- boston_s[-selection, ]
lda.fit <- lda(crime ~ ., data = train)

lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col=color, length = arrow_heads)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col=color, pos=3)
}

# target classes as numeric
classes <- as.numeric(train$crime)

# plot the lda results
plot(lda.fit, dimen = 2, col = classes , pch = classes)
lda.arrows(lda.fit, myscale = 1)

crimecat_actual <- test$crime
test <- dplyr::select(test, -crime)

lda.pred <- predict(lda.fit, newdata = test)

# cross tabulate the results
table(correct = crimecat_actual, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low       17       6        0    0
##   med_low    5      17        9    0
##   med_high   2       5       18    1
##   high       0       0        0   21

jännä

# k-means clustering
set.seed(777)

bst <- as.data.frame(scale(Boston))

km_5 <- kmeans(bst, centers = 5)

lda.fit2 <- lda(km_5$cluster ~ ., data = bst)

# target classes as numeric
class <- as.numeric(km_5$cluster)

# plot the lda results
plot(lda.fit2, dimen = 2, col = class , pch = class)
lda.arrows(lda.fit2, myscale = 1)

model_predictors <- dplyr::select(train, -crime)

# check the dimensions
dim(model_predictors)
## [1] 405  13
dim(lda.fit$scaling)
## [1] 13  3
# matrix multiplication
matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)


plotly::plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime)
#plotly::plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km$cluster)